Overview

There is mainly 3 parts to this story:

  1. A simple easy to communicate model of the key relationship

  2. A medium complexity smoothing analysis

  3. A full power time series analysis with causal inference

The two data set used in this analysis are the Madison case and waste water concentration data.

##           Date    Site Cases    N1 N1Error
## 435 2021-06-19 Madison     3    NA      NA
## 436 2021-06-20 Madison     8 49443    8812
## 437 2021-06-21 Madison     1 90447   20429
## 438 2021-06-22 Madison     3 44587   12427
## 439 2021-06-23 Madison     4 14469    9155
## 440 2021-06-24 Madison    NA 28023    7724

A simple display of the data shows the core components of this story. First that both data sets are extremely noisy. And that there is a hint of a relationship between the two signals.

MinMaxNorm <- function(Vec){#normalizes the data to range from 0 and 1
  normVec <- (Vec-min(Vec,na.rm=TRUE))/max(Vec,na.rm=TRUE)
  return(normVec)
}

NoNa <- function(DF,...){#Removes NA from the reverent columns
  ColumnNames <- c(...)
  NoNaDF <- DF%>%
    filter(
      across(
      .cols = ColumnNames,
      .fns = ~ !is.na(.x))
      )
  return(NoNaDF)
}
FillNA <- function(DF,...){#Fills NA with previous values
  ColumnNames <- c(...)
  NoNaDF <- DF%>%
    fill(ColumnNames)
  return(NoNaDF)
}

FirstImpression <- FullDF%>%
  NoNa("N1","Cases")%>%#Removing outliers
  ggplot(aes(x=Date))+#Data depends on time
  geom_line(aes(y=MinMaxNorm(N1), color="N1",info=N1))+#compares N1 to Cases
  geom_line(aes(y=MinMaxNorm(Cases), color="Cases",info=Cases))+
  labs(y="variable min max normalized")

ggplotly(FirstImpression,tooltip=c("info","Date"))

Cross correlation and Granger Causality are key component to this analysis. Cross correlation looks at the correlation at a range of time shifts and Granger analysis preform a test for predictive power. We find that there is to much noise to find significance.

Cases <- FullDF$Cases
N1 <- FullDF$N1

ccf(Cases,N1,na.action=na.pass)

grangertest(Cases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(Cases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df      F Pr(>F)
## 1    168                 
## 2    169 -1 1.4783 0.2257
grangertest(N1,Cases, order = 1)
## Granger causality test
## 
## Model 1: Cases ~ Lags(Cases, 1:1) + Lags(N1, 1:1)
## Model 2: Cases ~ Lags(Cases, 1:1)
##   Res.Df Df     F Pr(>F)
## 1    168                
## 2    169 -1 4e-04 0.9837
IntermediateOutlierGraphic <- FALSE

DaySmoothed=21#Very wide smoothing to find where the data strong deviate from trend

FullDF2 <- FullDF%>%
  mutate(N1 = ifelse(Date < mdy("11/20/2020"),NA,N1))

FullDF3 <- FullDF2%>%#Remove older data that clearly has no relationship to Cases
  mutate(SmoothN1=rollapply(data = N1, width = DaySmoothed, FUN = median, 
                            na.r = TRUE,fill=NA),#Finding very smooth version of the data with no outliers
         SmoothN1=ifelse(is.na(SmoothN1),N1,SmoothN1),#Fixing issue where rollapply fills NA on right border
         LargeError=N1>1.5*SmoothN1,#Calculating error Limits
         N1=ifelse(LargeError,SmoothN1,N1))%>%#replacing data points that variance is to large
  select(-SmoothN1,-LargeError)#Removing unneeded calculated columns

if(IntermediateOutlierGraphic){
  OutlierGraphic <- FullDF%>%
    mutate(SmoothN1=rollapply(data = N1, width = DaySmoothed, FUN = median, 
                            na.r = TRUE,fill=NA),#creating smooth data
           SmoothN1=ifelse(is.na(SmoothN1),N1,SmoothN1))%>%#Fixing issue where rollapply fills NA on right border)%>%
    mutate(Outliers=Date < mdy("11/20/2020")|N1>1.5*SmoothN1)%>%
    NoNa("N1","Cases")%>%#Removing outliers
    ggplot(aes(x=Date))+#Data depends on time
    geom_point(aes(y=MinMaxNorm(N1), color="N1",shape=Outliers,info=N1))+#compares N1 to Cases
    geom_point(aes(y=MinMaxNorm(Cases), color="Cases",info=Cases))+
    labs(y="variable min max normalized")
  ggplotly(OutlierGraphic,tooltip=c("info","Date"))
}


FullDF3%>%
  NoNa("N1","Cases")%>%#Removing outliers
  ggplot(aes(x=Date))+#Data depends on time
  geom_line(aes(y=MinMaxNorm(N1), color="N1"))+#compares N1 to Cases
  geom_line(aes(y=MinMaxNorm(Cases), color="Cases"))+
  labs(y="variable min max normalized")
library(tseries)

TestDF2 <- FullDF3%>%
  FillNA("N1","Cases")%>%
  NoNa("N1","Cases")


Cases <- TestDF2$Cases
N1 <- TestDF2$N1

# kpss.test(Cases)
# adf.test(Cases)
# 
# kpss.test(N1)
# adf.test(N1)

ccf(Cases,N1,na.action=na.pass)

grangertest(Cases, N1, order = 1)
grangertest(N1,Cases, order = 1)

A key component to this relationship is that the relationship between N1 and Case involves a gamma distribution modeling both the time between catching Covid-19 and getting a test and the concentration of the shedded particles. We found a gamma distribution with mean 11.73 days and a sd of 7.68 match’s other research and gives good results.

SLDWidth <- 21
scale  <- 5.028338
shape  <- 2.332779 #These parameters are equivalent to the mean and sd above

weights <- dgamma(1:SLDWidth, scale = scale, shape = shape)
plot(weights,  
            main=paste("Gamma Distribution with mean = 11.73 days,and SD = 7.68"), 
            ylab = "Weight", 
            xlab = "Lag")

SLDSmoothedDF <- FullDF3%>%
  mutate(
    SLDCases = c(rep(NA,SLDWidth-1),#elimination of starting values not relevant as we have a 50+ day buffer of case data
                        rollapply(Cases,width=SLDWidth,FUN=weighted.mean,
                                  w=weights,
                                  na.rm = FALSE)))#no missing data to remove


SLDSmoothedDF%>%
  NoNa("N1","SLDCases")%>%#same plot as earlier but with the SLD smoothing
  ggplot(aes(x=Date))+
  geom_line(aes(y=MinMaxNorm(SLDCases), color="SLDCases"))+
  geom_line(aes(y=MinMaxNorm(N1), color="N1"))+
  facet_wrap(~Site)+
  labs(y="variable min max normalized")

The SLD Cause a signfigent improvement in the relationship now showing a statisticly signifigent test result

TestDF3 <- SLDSmoothedDF%>%
  FillNA("N1","SLDCases")%>%
  NoNa("N1","SLDCases")

SLDCases <- TestDF3$SLDCases
N1 <- TestDF3$N1

ccf(SLDCases,N1,na.action=na.pass)

grangertest(SLDCases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df      F   Pr(>F)   
## 1    279                      
## 2    280 -1 8.3701 0.004115 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
grangertest(N1,SLDCases, order = 1)
## Granger causality test
## 
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
##   Res.Df Df      F    Pr(>F)    
## 1    279                        
## 2    280 -1 16.616 5.974e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To isolate this relationship we used a primitive binning relationship. This clarifies the relationship we see hints of in the previous graphic and masks the noise in the data.

medianMean <- function(Vec){
  return(mean(replace(Vec, c(which.min(Vec), which.max(Vec)), NA), na.rm = TRUE))
}

StartDate <- 1
DaySmoothing <- 14
Lag <- 4

BinDF <- SLDSmoothedDF%>%
  select(Date, SLDCases, N1)%>%
    mutate(MovedCases = data.table::shift(SLDCases, Lag), 
           Week=(as.numeric(Date)+StartDate)%/%DaySmoothing)%>%
  group_by(Week)%>%
  #filter(Week>2670)%>%
  summarise(BinnedCases=mean(MovedCases, na.rm=TRUE), BinnedN1=exp(mean(log(N1), na.rm=TRUE)))



BinDF%>%
  ggplot()+
  geom_line(aes(x=Week, y=MinMaxNorm(BinnedN1), color="N1"))+
  geom_line(aes(x=Week, y=MinMaxNorm(BinnedCases), color="Cases"))+
  labs(y="Binned variable min max normalized")

BinDF%>%
  ggplot()+
  geom_point(aes(x=BinnedCases, y=BinnedN1))

cor(BinDF$BinnedN1, BinDF$BinnedCases, use="pairwise.complete.obs")
## [1] 0.5280613
summary(lm(BinnedCases~BinnedN1, data=BinDF))
## 
## Call:
## lm(formula = BinnedCases ~ BinnedN1, data = BinDF)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96.41 -48.94 -32.55  54.48 165.36 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 4.130e+01  2.631e+01   1.570   0.1322  
## BinnedN1    4.650e-04  1.672e-04   2.781   0.0115 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70.64 on 20 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.2788, Adjusted R-squared:  0.2428 
## F-statistic: 7.733 on 1 and 20 DF,  p-value: 0.01153

To generate this relationship without reducing the amount of data we rely on a loess smoothing of the data. The loess smoothing is a way of generating smooth curves from noisy data. The displayed plot shows the visual power of this smoothing. We see a relationship in the big patterns but also multiple sub patterns match. We see in general that smoothed N1 both lags and leads the case data.

SLDSmoothedDF$loessN1 <- loessFit(y=(SLDSmoothedDF$N1), 
                      x=SLDSmoothedDF$Date, 
                      span=.2, 
                      iterations=2)$fitted


SLDSmoothedDF%>%
  NoNa("loessN1","SLDCases")%>%
  ggplot()+
  geom_line(aes(x=Date, y=MinMaxNorm(loessN1), color="loessN1"))+
  geom_line(aes(x=Date, y=MinMaxNorm(SLDCases), color="SLDCases"))+
  facet_wrap(~Site)+
  labs(y="variable min max normalized")

The loess smoothing roughly doubled the correlation at all time lags. The two granger tests p value also got signifigently smaller

TestDF4 <- SLDSmoothedDF%>%
  FillNA("loessN1","SLDCases")%>%
  NoNa("loessN1","SLDCases")

SLDCases <- TestDF4$SLDCases
N1 <- TestDF4$loessN1

ccf(SLDCases,N1,na.action=na.pass)

grangertest(SLDCases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df      F    Pr(>F)    
## 1    279                        
## 2    280 -1 89.183 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
grangertest(N1,SLDCases, order = 1)
## Granger causality test
## 
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
##   Res.Df Df      F    Pr(>F)    
## 1    279                        
## 2    280 -1 69.758 3.177e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1